*01_data_cleaning_1910_1920.do

* --- Session setup -----------------------------------------------------------
* Start from an empty Stata session and close any log left open by an
* earlier (possibly aborted) run; -capture- suppresses the error when no log
* is open.
clear all
capture log close

* Directory layout. $ipums holds the census .dta files (read below by year);
* $tempdir receives the cleaned output files and $logdir the log.
global ipums   "/disk/data3/census-ipums/v2019/dta"
global root    "/disk/bulkw/nencka/schooling_pandemic/2021_10_18_final/"
global tempdir "${root}/Temp"
global logdir  "${root}/Log"

* Plain-text log, overwritten on every run.
log using "$logdir/01_data_cleaning_1910_1920.log", text replace


*NOTE: momloc and poploc contain the pernum of the mother and father of each child.

foreach yearval in 1910 1920 {
di `yearval'

* Variables wanted downstream. The list is intersected with the variables
* actually present in this year's file, so a name missing from the file is
* skipped rather than raising an error.
local masterlist "pernum year histid serial urban age sex birthmo agemonth bpl relate namelast namefrst school schlmnth lit bplstr fbplstr mbplstr fbpl mbpl momloc poploc sploc racesing race county stcounty city stdcity stdmcd enumdist reel pageno line occstr occ1950 occscore ind1950"

* Read only the variable names from the file header (no data are loaded),
* then keep the requested names in the order they appear in the file.
quietly describe using "$ipums/`yearval'", varlist
local allvars `r(varlist)'
local keeplist: list allvars & masterlist

* Record in the log which requested variables this year's file lacks.
local absent: list masterlist - allvars
di "Requested variables not found in `yearval' file: `absent'"

* Load just the needed columns instead of reading the whole file into memory
* and dropping the rest afterwards.
use `keeplist' using "$ipums/`yearval'", clear



di "First make a dataset of parents, containing all variables we might want to use to describe each child's household circumstances"

* Work on a copy of the data; the full dataset comes back at -restore-.
preserve

	* Merge keys (serial, pernum) plus the attributes to attach to each child.
	keep serial pernum age sex bpl bplstr namelast namefrst racesing occ1950 occscore ind1950 race sploc

	* Suffix every attribute (everything except the merge keys) with _parent
	* so it cannot collide with the child's own variables after merging.
	ds serial pernum, not 
	local attribs `r(varlist)'
	foreach attr of local attribs { 
		rename `attr' `attr'_parent
	} 

	* The m:1 merges below need (serial, pernum) to be unique in this file:
	* report duplicates, list them in the log, then discard EVERY row whose
	* key is repeated (temp = number of other rows sharing the key).
	duplicates report serial pernum
	duplicates tag serial pernum, gen(temp)
	list if temp>0
	keep if temp==0
	drop temp

	tempfile parents
	save "`parents'"

* Back to the full person-level dataset
restore


di "Now make a dataset of ALL people and merge on mother and father attributes"

* Mark each person's own copies of these variables with _base so they stay
* distinct from the mother's (_m) and father's (_f) versions merged on below.
rename (occ1950 occscore ind1950 sploc) (occ1950_base occscore_base ind1950_base sploc_base)

rename pernum pernum_base

* Attach parent attributes twice: once via momloc (suffix _m) and once via
* poploc (suffix _f). Each pass temporarily renames the pointer variable to
* pernum so it can serve as the merge key into the parents file.
local pointers momloc poploc
local suffixes m f
forvalues k = 1/2 {
	local ptr : word `k' of `pointers'
	local sfx : word `k' of `suffixes'

	rename `ptr' pernum
	merge m:1 serial pernum using "`parents'"
	* Discard parents-file rows that matched nobody in the master data
	keep if _merge!=2
	drop _merge
	rename pernum pernum_`sfx'
	rename *_parent *_`sfx'
}


*Construct birth order variable within household

* Household identifier: year and serial combined into a string, then mapped
* to consecutive integers by -egen group()-.
* NOTE(review): yearstring and serialstring are never dropped, so they are
* still in memory when the file is saved later in the loop; left untouched
* here in case a later script reads them -- confirm and drop if unused.
tostring year, gen(yearstring)
tostring serial, gen(serialstring)

gen hhid=yearstring+"hh"+serialstring

egen long temp = group(hhid)
drop hhid
rename temp hhid

destring relate, replace
tab relate, m

* Birth year used only for ranking: defined for relate codes 301-304 and
* missing for everyone else (presumably the detailed "child of head"
* categories -- confirm against the codebook).
gen child_birthyr = year-age if inrange(relate,301,304)

* Rank within household by birth year (earliest = 1). Missing values sort
* last, so the ranked persons occupy positions 1..k. pernum_base breaks ties
* between persons with the same birth year; without it Stata orders tied
* observations arbitrarily and birth_order may not be reproducible across runs.
bys hhid (child_birthyr pernum_base): gen birth_order=_n
replace birth_order=. if missing(child_birthyr)
tab relate birth_order if inrange(birth_order,1,10) | missing(birth_order), m
drop child_birthyr

* Birth year for every person (not just those ranked above)
gen birthyr=year-age

list in 1/10
drop hhid


* State code: stcounty with its last four digits removed.
gen statefip=int(stcounty/10000)

* NOTE(review): mcd is taken from stdcity even though stdmcd is also in the
* requested variable list -- confirm this is the intended source.
rename stdcity mcd 
replace mcd = lower(mcd)

* Keep only persons whose histid occurs exactly once; every copy of a
* repeated histid is discarded.
bysort histid: generate numobs = _N
tab numobs, m
keep if numobs==1
drop numobs

*Save final file of all persons for this census year

compress
di _N
ds

save "$tempdir/data`yearval'_1918pandemic", replace

}

log close

